library(FactoMineR)
library(factoextra)
library(CASdatasets)
library(tidyverse)
library(MASS)
library(knitr)
library(ggplot2)
library(cowplot)
library(reshape2)
library(dplyr)
library(GGally)
library(corrplot)
library(carData)
library(car)
library(questionr)
library(multcomp)
library(dplyr)
library(leaps)
library(TeachingDemos)
library(FactoMineR)
library(factoextra)
library(ROCR)
library(plotROC)
# `?CASdatasets` opens the package help, which describes the data set.
data("freMPL5")
print(summary(freMPL5))
## Exposure LicAge RecordBeg RecordEnd
## Min. :0.001 Min. : 24 Min. :2004-01-01 Min. :2004-01-02
## 1st Qu.:0.170 1st Qu.:229 1st Qu.:2004-01-01 1st Qu.:2004-05-01
## Median :0.403 Median :355 Median :2004-04-15 Median :2004-07-18
## Mean :0.423 Mean :350 Mean :2004-05-07 Mean :2004-07-20
## 3rd Qu.:0.666 3rd Qu.:463 3rd Qu.:2004-08-11 3rd Qu.:2004-10-14
## Max. :1.000 Max. :844 Max. :2004-12-31 Max. :2004-12-31
## NA's :12818
## Gender MariStat SocioCateg VehUsage
## Female: 8856 Alone: 3461 CSP50 :12385 Private : 8576
## Male :17144 Other:22539 CSP60 : 5646 Private+trip to office:12163
## CSP55 : 3247 Professional : 4299
## CSP1 : 960 Professional run : 962
## CSP66 : 672
## CSP42 : 655
## (Other): 2435
## DrivAge HasKmLimit ClaimAmount ClaimNbResp
## Min. :20.00 Min. :0.00000 Min. :-1842.0 Min. :0.0000
## 1st Qu.:39.00 1st Qu.:0.00000 1st Qu.: 0.0 1st Qu.:0.0000
## Median :51.00 Median :0.00000 Median : 0.0 Median :0.0000
## Mean :50.45 Mean :0.07462 Mean : 204.9 Mean :0.2663
## 3rd Qu.:61.00 3rd Qu.:0.00000 3rd Qu.: 0.0 3rd Qu.:0.0000
## Max. :95.00 Max. :1.00000 Max. :95151.0 Max. :4.0000
##
## ClaimNbNonResp ClaimNbParking ClaimNbFireTheft ClaimNbWindscreen
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.000
## Mean :0.3218 Mean :0.08265 Mean :0.07638 Mean :0.418
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.000
## Max. :7.0000 Max. :4.00000 Max. :3.00000 Max. :8.000
##
## OutUseNb RiskArea BonusMalus ClaimInd
## Min. :0.0000 Min. : 1.000 Min. : 50.00 Min. :0.000
## 1st Qu.:0.0000 1st Qu.: 6.000 1st Qu.: 50.00 1st Qu.:0.000
## Median :0.0000 Median : 7.000 Median : 50.00 Median :0.000
## Mean :0.2243 Mean : 7.845 Mean : 57.78 Mean :0.093
## 3rd Qu.:0.0000 3rd Qu.:10.000 3rd Qu.: 60.00 3rd Qu.:0.000
## Max. :5.0000 Max. :13.000 Max. :185.00 Max. :1.000
##
# Number of rows for each level of SocioCateg.
summary(freMPL5[["SocioCateg"]])
## CSP1 CSP17 CSP2 CSP20 CSP21 CSP22 CSP26 CSP3 CSP30 CSP37 CSP38 CSP40 CSP41
## 960 2 78 93 171 127 157 29 1 219 2 74 2
## CSP42 CSP44 CSP45 CSP46 CSP47 CSP48 CSP49 CSP50 CSP51 CSP55 CSP56 CSP57 CSP59
## 655 1 2 651 44 494 78 12385 6 3247 2 9 2
## CSP6 CSP60 CSP61 CSP63 CSP65 CSP66 CSP7
## 171 5646 1 1 14 672 4
On remarque que certaines variables sont numériques au lieu d’être considérées comme des facteurs. Nous allons donc les changer :
# Recode the two 0/1 columns as factors and store OutUseNb as a double.
indicator_cols <- c("HasKmLimit", "ClaimInd")
freMPL5[indicator_cols] <- lapply(freMPL5[indicator_cols], factor)
freMPL5[["OutUseNb"]] <- as.numeric(freMPL5[["OutUseNb"]])
# Two-dimensional kernel density estimate (kde2d) for every unordered pair of
# the variables listed below, each drawn as a filled contour plot.
# combn() yields the 21 pairs in list order: the first variable with each of
# the following ones, then the second variable with each following one, etc.
density_vars <- c(
  "DrivAge", "BonusMalus", "LicAge", "Exposure",
  "ClaimNbNonResp", "ClaimNbWindscreen", "RiskArea"
)
for (pair in combn(density_vars, 2, simplify = FALSE)) {
  # The name `A` is kept so the last estimate remains available under it.
  A <- kde2d(freMPL5[[pair[1]]], freMPL5[[pair[2]]])
  # Axis labels identify which pair is shown; filled.contour() forwards them
  # to title().
  filled.contour(A, xlab = pair[1], ylab = pair[2])
}
On remarque donc uniquement une réelle forte corrélation entre LicAge et DrivAge pour les variables continues.
# freMPL stands for "French Motor Personal Line" data sets.
# Data set 5 is used here; it holds about 26,000 contracts from 2004.
# Quantitative variables, selected by name rather than by column position
# (they are columns 1, 2, 9 and 11 to 19 of freMPL5).
quanti_vars <- c(
  "Exposure", "LicAge", "DrivAge", "ClaimAmount", "ClaimNbResp",
  "ClaimNbNonResp", "ClaimNbParking", "ClaimNbFireTheft",
  "ClaimNbWindscreen", "OutUseNb", "RiskArea", "BonusMalus"
)
x <- freMPL5[, quanti_vars]
# Correlation matrix rounded to two decimals, displayed as ellipses.
corrplot(round(cor(x), 2), method = "ellipse")
# PCA on the quantitative variables. RiskArea is treated as quantitative here
# because it is "possibly ordered".
PCA(x)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 26000 individuals, described by 12 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
On va ici modifier les données pour faire une ACM.
# Reload the raw data and drop the rows whose claim amount is negative: they
# correspond to adjustments and bring nothing to the analysis.
data("freMPL5")
freMPL5 <- subset(freMPL5, ClaimAmount >= 0)

# Recode the discrete numeric columns as factors.
discrete_cols <- c(
  "HasKmLimit", "RiskArea", "ClaimInd", "ClaimNbFireTheft", "ClaimNbResp",
  "ClaimNbNonResp", "ClaimNbParking", "ClaimNbWindscreen", "OutUseNb"
)
freMPL5[discrete_cols] <- lapply(freMPL5[discrete_cols], factor)

# For the analysis, the quantitative variables are turned into qualitative
# ones. Exposure, DrivAge and LicAge are cut at their own quantiles so that
# the classes hold similar numbers of policyholders; BonusMalus and the claim
# amount are cut at fixed break points.
quantile_classes <- function(values, n_classes) {
  class_limits <- quantile(values, probs = seq(0, 1, 1 / n_classes))
  cut(values, class_limits, include.lowest = TRUE)
}
freMPL5$Exposure <- quantile_classes(freMPL5$Exposure, 4)
freMPL5$DrivAge <- quantile_classes(freMPL5$DrivAge, 6)
freMPL5$LicAge <- quantile_classes(freMPL5$LicAge, 6)

bonus_breaks <- c(50, 54, seq(60, 200, 20))
freMPL5$BonusMalus <- cut(freMPL5$BonusMalus, bonus_breaks,
                          include.lowest = TRUE)

# New column: claim amount class (breaks at 0, 1, 1001, 2001, ...).
cost_breaks <- c(0, seq(1, 100000, 1000))
freMPL5$IntervalCout <- cut(freMPL5$ClaimAmount, cost_breaks,
                            include.lowest = TRUE)
Voici la réalisation de l’ACM :
# Keep only the factor columns of the prepared data: they feed the MCA.
fact <- dplyr::select_if(freMPL5, is.factor)
# MCA keeping 5 dimensions; the built-in graphs are drawn on axes 1 and 2.
res.mca <- MCA(fact, ncp = 5, graph = TRUE, axes = c(1, 2))
## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
print(res.mca)
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 25722 individuals, described by 18 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. of the categories"
## 4 "$var$cos2" "cos2 for the categories"
## 5 "$var$contrib" "contributions of the categories"
## 6 "$var$v.test" "v-test for the categories"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "intermediate results"
## 12 "$call$marge.col" "weights of columns"
## 13 "$call$marge.li" "weights of rows"
# Variable categories on axes 1 and 2, coloured by their cos2 value.
fviz_mca_var(
  res.mca,
  axes = c(1, 2),
  col.var = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE,
  ggtheme = theme_minimal()
)
Les variables à faible valeur de cos2 seront en bleu-vert, les variables à valeur moyenne en jaune et les variables à valeur forte en rouge orangé.
# MCA of the same factors, with its built-in graphs drawn on axes 1 and 3.
res.mca_2 <- MCA(fact, ncp = 5, graph = TRUE, axes = c(1, 3))
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# fviz_mca_var() plots axes 1 and 2 unless told otherwise, so the 1-3 plane
# has to be requested explicitly; without `axes` this plot repeated the
# 1-2 plane.
fviz_mca_var(
  res.mca_2,
  axes = c(1, 3),
  col.var = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE,
  ggtheme = theme_minimal()
)
# MCA of the same factors, with its built-in graphs drawn on axes 2 and 3.
res.mca_3 <- MCA(fact, ncp = 5, graph = TRUE, axes = c(2, 3))
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Same remark as above: request the 2-3 plane explicitly.
fviz_mca_var(
  res.mca_3,
  axes = c(2, 3),
  col.var = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE,
  ggtheme = theme_minimal()
)
# Biplot of the first MCA, limited to the 15 individuals and the 15
# categories selected on their contribution.
fviz_mca_biplot(
  res.mca,
  select.ind = list(contrib = 15),
  select.var = list(contrib = 15)
)
# FactoMineR maps: first with the categories hidden, then with the
# individuals hidden, then with both shown. Supplementary elements are
# hidden in all three.
hidden_sup <- c("quali.sup", "quanti.sup")
plot(res.mca, invisible = c("var", hidden_sup), cex = 0.7)
plot(res.mca, invisible = c("ind", hidden_sup), cex = 0.8)
plot(res.mca, invisible = hidden_sup, cex = 0.8)
# Results for the variables RiskArea, ClaimInd, SocioCateg and MariStat,
# selected through their column positions.
plotellipses(res.mca, keepvar = c(4, 5, 17, 15))
# Contributions of the variables to each of the first three axes.
for (axis in 1:3) {
  print(fviz_contrib(res.mca, choice = "var", axes = axis, top = 30))
}
# Map of the individuals, drawn once per factor column of `fact`: points are
# coloured by the levels of that column (habillage) and a 95% ellipse is
# added for each level. Labels are hidden.
# Looping over the columns replaces one hand-written call per index (1 to 18)
# and avoids drawing the same plot twice (index 5 used to be repeated).
# In the recorded run, the message "Too few points to calculate an ellipse"
# was printed for indices 5, 10, 13, 16 and 18, i.e. for columns that have
# levels with too few individuals.
for (var_idx in seq_along(fact)) {
  # Inside a loop a ggplot object is not printed automatically.
  print(
    fviz_mca_ind(
      res.mca,
      label = "none",
      habillage = var_idx,
      addEllipses = TRUE,
      ellipse.level = 0.95
    )
  )
}